A group of customers were given an offer in person that they can get a loan at discounted rate and processing fee will be waived off. A pilot campaign was conducted to get response from customers whether they are interested in taking out a loan or not. Response was recorded and data was collected.
Variables involved: Customer_id, Age, Gender, Balance, Occupation, No of Credit transaction, SCR, Holding period
Bucketed variables were created for the final visualization: age_d (buckets 1, 2, 3, 4) vs Target, SCR, and Balance.
Understanding Variables¶
Holding Period (how long the customer is able to hold money in their account. If they have existing expenses — such as a loan EMI or another monthly payment that is deducted, usually in the first week of the month — the account balance drops early in the month. The higher the holding period, the more stable the money in the account.)
SCR is a score given to a customer for a particular product (in this case, a loan) based on certain parameters, indicating how likely that customer is to buy the product. The higher the score, the higher the probability the customer will buy it. SCR measures the propensity of a customer to respond to digital marketing.
Now All Models measure recall on same testing data
Fixed Sampling mistake
Redefined print_classification_report as classification_report for better clarity and ease of use
Visualized Decision Trees
Implemented SVC
Implemented KNN which provided great results with default parameters
Fit Random Forest Models
Fit XgBoost Models
Added cross validation
Added ROC plots
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_curve, auc, plot_roc_curve
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import plot_partial_dependence
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE
import xgboost as xgb
#from pandasgui import show
# Load the pilot-campaign response dataset: Target (loan taken or not) plus
# demographic and account-activity features.
data = pd.read_csv('Model_data.csv')
#show(data)
data.Balance = data.Balance.astype('int32') #Truncating decimals
data.head()
data.Balance.describe()
data.shape
data.info()
Gender and Occupation are categorical variables stored as object type
EDA
No Strong correlations measured except for mild ones in Holding_period and other variables
# Pairwise correlations of the numeric columns.
sns.heatmap(data.corr(), annot=True, square=True) # No strong correlations seen overall
plt.show()
sns.pairplot(data, hue='Target')
plt.show()
sns.countplot(x = data.Gender)
plt.show()
data.Gender.unique()
data.Occupation.unique()
data.Gender.value_counts()
# Drop the small 'O' (other) gender group.
data.drop(data.Gender[data.Gender== 'O'].index, axis = 0, inplace= True) # Removed 196 rows with `Gender` = 'O'
data.shape
data.Balance.describe()
# sns.histplot(data.Age)
sns.countplot(x = data.Occupation)
sns.countplot(x=data.Target, hue=data.Occupation) ## Self employed are much more likely to take loans
# Gender counts per occupation, one panel per occupation.
g = sns.FacetGrid(data, col='Occupation', hue="Gender")
plt.grid(True)
g.map(sns.countplot, "Gender", alpha=1)
g.add_legend()
plt.grid((False))
# sns.histplot(data.No_OF_CR_TXNS)
data.No_OF_CR_TXNS.describe()
sns.violinplot(x=data.No_OF_CR_TXNS)
plt.grid(True)
# len(data[data.No_OF_CR_TXNS==0].index)
# data.drop(index=data[data.No_OF_CR_TXNS==0].index, axis=0)
# sns.displot(data.SCR, kind = 'kde')
# NOTE(review): sns.distplot is deprecated (removed in newer seaborn);
# histplot/displot is the replacement.
sns.distplot(data.SCR)
plt.show()
data.SCR.describe()
# sns.histplot(data.Holding_Period)
# NOTE: data3 is an ALIAS of data (no copy) — later in-place changes to
# `data` are visible through data3, which the LIME section relies on.
data3 = data
Create a function for easy report printing¶
# A class for pretty printing
class color:
    """Namespace of ANSI escape codes used by the report printers.

    Wrap text as ``color.BOLD + text + color.END``; ``END`` resets all
    terminal attributes.
    """
    # foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # reset everything
    END = '\033[0m'
# function for validation on test data
def classification_report(y_true, y_prediction, type_of_data='Enter Over/Under/Original sampled', type_of_classifier='ClassifierName'):
    """Print a colour-coded classification report for test-set predictions.

    y_true / y_prediction : true and predicted labels.
    type_of_data          : which training-data variant was used (Over/Under/Original).
    type_of_classifier    : display name of the model.
    """
    # Compute all four metrics up front; printed in the order below.
    metrics = {
        'Recall': recall_score(y_true, y_prediction),
        'Precision': precision_score(y_true, y_prediction),
        'F1 score': f1_score(y_true, y_prediction),
        'Accuracy': accuracy_score(y_true, y_prediction),
    }
    print(f'Classification Report on Testing Data:\n{color.BOLD}{type_of_data} data\n'
          f'{color.END}{color.RED}{color.BOLD}{type_of_classifier} Classifier{color.END}{color.END}')
    print()
    print('---------------------------------------')
    print(f"{color.BOLD}Recall: {metrics['Recall']}{color.END}")
    print(f"Precision: {metrics['Precision']}")
    print(f"F1 score: {metrics['F1 score']}")
    print(f"Accuracy: {metrics['Accuracy']}")
    print('---------------------------------------')
    print()
# A function for cross-validation report
def cross_val_report(classifier, train_data, train_label, cv=10, scoring=('recall', 'precision', 'f1', 'accuracy')):
    """Cross-validate `classifier` and print the mean recall, precision,
    F1 and accuracy over `cv` folds.

    BUG FIX: the `scoring` default was a mutable list (shared across all
    calls); a tuple is a safe default and cross_validate accepts lists and
    tuples interchangeably.
    """
    score = cross_validate(classifier, train_data, train_label, cv=cv, scoring=scoring)
    # cross_validate returns per-fold arrays keyed 'test_<metric>'.
    recall = np.mean(score['test_recall'])
    precision = np.mean(score['test_precision'])
    f1 = np.mean(score['test_f1'])
    accuracy = np.mean(score['test_accuracy'])
    print('Cross Validation Report')
    print(color.BOLD + 'Recall: %s' %recall + color.END)
    print('Precision: %s' %precision)
    print('F1: %s' %f1)
    print('Accuracy: %s' %accuracy)
    print()
    print("*Mean values presented")
    print('---------------------------------------')
Create the first set of training and test data on imbalanced data
# One-hot encode the two object-typed columns; drop_first avoids the dummy trap.
df = pd.get_dummies(data, columns=['Gender','Occupation'], drop_first = True)
df.head()
Creating a model with Original Unbalanced data and measuring metrics
# Features are every column after Target; the label is the first column.
X_original = df.iloc[:,1:]
y_original = df.iloc[:,0]
X_original
# BUG FIX: shuffle was passed as the list ['True'] — a non-empty list is
# always truthy so it happened to behave like True, but it was clearly
# accidental; pass the boolean. Stratify keeps the class ratio in both splits.
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(X_original, y_original, shuffle=True, stratify=y_original)
# Baseline decision tree on the raw (imbalanced) training data.
clf = DecisionTreeClassifier(max_depth = 5)
clf.fit(X_train_orig, y_train_orig)
y_prediction_orig = clf.predict(X_test_orig)
classification_report(y_test_orig, y_prediction_orig, 'Original', 'Decision Tree')
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2;
# RocCurveDisplay.from_estimator is the replacement.
plot_roc_curve(clf, X_test_orig, y_test_orig)
plt.show()
# cross_val_report(clf, y_test_orig,y_under_prediction.reshape(1,-1))
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_orig, y_prediction_orig)
# fig = plt.figure(figsize=(50,20))
# _ = plot_tree(clf,
#               feature_names=list(X_original.columns),
#               class_names=['0','1'],
#               filled=True, fontsize=10)
Create undersampled data and fit a model
# Undersample the majority class of the TRAINING split with NearMiss so the
# tree sees balanced classes; the test split stays untouched.
X_under_train, y_under_train = NearMiss().fit_resample(X_train_orig, y_train_orig)
# Size of the positive (responder) class, for comparison with the resampled shape.
data[data.Target==1].shape
X_under_train.shape, y_under_train.shape
clf_under_sampled = DecisionTreeClassifier(max_depth = 5)
clf_under_sampled.fit(X_under_train, y_under_train)
# Evaluate on the ORIGINAL (imbalanced) test split so metrics stay comparable
# across all sampling strategies.
y_under_prediction = clf_under_sampled.predict(X_test_orig)
classification_report(y_test_orig,y_under_prediction, 'Undersampled', 'Decision Tree')
plot_roc_curve(clf_under_sampled, X_test_orig, y_test_orig)
# cross_val_report(clf_under_sampled, y_test_orig,y_under_prediction)
## crossval here causes unbalanced split
# fig = plt.figure(figsize=(100,100))
# _ = plot_tree(clf_under_sampled,
# feature_names=list(X_original.columns),
# class_names=['0','1'],
# filled=True, fontsize=10)
Model on an oversampled dataset
# BUG FIX (data leakage): SMOTE was previously fit on the FULL dataset
# (X_original/y_original), so synthetic training rows were interpolated from
# points that also sit in X_test_orig, and real test rows were trained on.
# Oversample only the training split so evaluation on X_test_orig stays honest.
X_over_train, y_over_train = SMOTE().fit_resample(X_train_orig, y_train_orig)
clf_over_sampled = DecisionTreeClassifier(max_depth = 5)
clf_over_sampled.fit(X_over_train, y_over_train)
# Evaluate on the untouched, imbalanced test split.
y_over_predict = clf_over_sampled.predict(X_test_orig)
classification_report(y_test_orig, y_over_predict, 'Oversampled', 'Decision Tree')
plot_roc_curve(clf_over_sampled, X_test_orig, y_test_orig)
# fig = plt.figure(figsize=(100,100))
# _ = plot_tree(clf_over_sampled,
#               feature_names=list(X_original.columns),
#               class_names=['0','1'],
#               filled=True, fontsize=10)
# Summary of the dataset variants and the variable names that hold them.
print("Original: "+color.BOLD+ "X_original,y_original"+color.END+":: X_train_orig, X_test_orig, y_train_orig, y_test_orig")
print()
print("Undersampled:"+color.BOLD+ " X_under, y_under"+color.END+" :: X_under_train, y_under_train")
print()
print("Oversampled:"+color.BOLD+ " X_over, y_over"+color.END+" :: X_over_train, y_over_train")
The above datasets can be better sampled by adjusting hyper-parameters of NearMiss and SMOTE, or other methods of sampling could be used
SVC fails to fit on the original dataset, possibly because of the class imbalance
# SVM trained on the undersampled split.
clf_svc0 = SVC()
clf_svc0.fit(X_under_train, y_under_train)
y_predict = clf_svc0.predict(X_test_orig)
classification_report(y_test_orig, y_predict, 'Undersampled', 'SVM')
plot_roc_curve(clf_svc0, X_test_orig, y_test_orig)
# Cross-validation is run on the undersampled split (per the earlier note,
# CV on the raw data produces badly unbalanced folds).
cross_val_report(clf_svc0, X_under_train, y_under_train)
# %%time
# # Will take LONG Time for Training
# clf_svc1 = SVC()
# clf_svc1.fit(X_over_train, y_over_train)
# y_predict = clf_svc1.predict(X_test_orig)
# classification_report(y_test_orig, y_predict, 'Oversampled', 'SVM')
# plot_roc_curve(clf_svc1, X_test_orig, y_test_orig)
# cross_val_report(clf_svc1, X_under_train, y_under_train)
# KNN trained on the original (imbalanced) training split.
clf_KNN0 = KNeighborsClassifier()
clf_KNN0.fit(X_train_orig, y_train_orig)
y_predict= clf_KNN0.predict(X_test_orig)
classification_report(y_test_orig, y_predict, 'Original', 'KNN')
plot_roc_curve(clf_KNN0, X_test_orig, y_test_orig)
# CV deliberately uses the undersampled split for every model (CV on the
# raw data gives unbalanced folds — see the earlier comment).
cross_val_report(clf_KNN0, X_under_train, y_under_train)
# KNN trained on the undersampled split.
clf_KNN1 = KNeighborsClassifier()
clf_KNN1.fit(X_under_train, y_under_train)
y_predict= clf_KNN1.predict(X_test_orig)
classification_report(y_test_orig, y_predict, 'Undersampled', 'KNN')
plot_roc_curve(clf_KNN1, X_test_orig, y_test_orig)
cross_val_report(clf_KNN1, X_under_train, y_under_train)
# Random forest on the original training split.
clf_rf0 = RandomForestClassifier()
clf_rf0.fit(X_train_orig, y_train_orig)
y_predict= clf_rf0.predict(X_test_orig)
classification_report(y_test_orig, y_predict, 'Original', 'Random Forest')
plot_roc_curve(clf_rf0, X_test_orig, y_test_orig)
cross_val_report(clf_rf0, X_under_train, y_under_train)
# NOTE(review): plot_partial_dependence was removed in scikit-learn 1.2;
# PartialDependenceDisplay.from_estimator is the replacement.
plot_partial_dependence(clf_rf0, X_test_orig, X_test_orig.columns)
# Top-4 features by importance according to the forest.
(pd.Series(clf_rf0.feature_importances_, index=X_test_orig.columns)
.nlargest(4)
.plot(kind='bar'))
# Recursive feature elimination down to 4 features, on the oversampled data.
from sklearn.feature_selection import RFE
rfe = RFE(RandomForestClassifier(), n_features_to_select=4)
rfe = rfe.fit(X_over_train, y_over_train)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
# Indices of non-float columns, to pass to CatBoost as categorical features.
# BUG FIX: np.float (a deprecated alias of the builtin float, removed in
# NumPy 1.24) raised AttributeError on modern NumPy; compare with float.
categorical_features_indices = np.where(X_train_orig.dtypes != float)[0]
#importing library and building model
from catboost import CatBoostRegressor
# NOTE(review): a *regressor* with RMSE is being fit on the binary Target
# here — CatBoostClassifier would be the conventional choice; confirm intent.
model=CatBoostRegressor(iterations=50, depth=3, learning_rate=0.1, loss_function='RMSE')
model.fit(X_train_orig, y_train_orig,cat_features=categorical_features_indices,eval_set=(X_test_orig, y_test_orig),plot=True)
# BUG FIX: shap_values was never defined at this point, so this line raised
# NameError; disabled until SHAP values are actually computed for this model.
# shap.summary_plot(shap_values, X_test_orig)
# BUG FIX: CatBoostClassifier is not imported until much later in the file;
# ask for help on the class actually in scope here.
help(CatBoostRegressor)
X_train_orig.columns
# Random forest trained on the undersampled split.
clf_rf1 = RandomForestClassifier()
clf_rf1.fit(X_under_train, y_under_train)
y_predict= clf_rf1.predict(X_test_orig)
classification_report(y_test_orig, y_predict, 'Undersampled', 'Random Forest')
plot_roc_curve(clf_rf1, X_test_orig, y_test_orig)
cross_val_report(clf_rf1, X_under_train, y_under_train)
# BUG FIX: this model is reported as 'Oversampled' but was trained on
# X_under_train (copy-paste slip) — train it on the SMOTE-oversampled split.
clf_rf2 = RandomForestClassifier()
clf_rf2.fit(X_over_train, y_over_train)
y_predict= clf_rf2.predict(X_test_orig)
classification_report(y_test_orig, y_predict, 'Oversampled', 'Random Forest')
plot_roc_curve(clf_rf2, X_test_orig, y_test_orig)
cross_val_report(clf_rf2, X_under_train, y_under_train)
# Top-4 feature importances of the oversampled-data forest.
(pd.Series(clf_rf2.feature_importances_, index=X_test_orig.columns)
 .nlargest(4)
 .plot(kind='barh'))
# rfe = RFE(KNeighborsClassifier(), n_features_to_select=4)
# rfe = rfe.fit(X_train_orig, y_train_orig)
# # summarize the selection of the attributes
# print(rfe.support_)
# print(rfe.ranking_)
# Logistic regression on the original training split.
# (LogisticRegression is imported from sklearn.linear_model in the import
# block at the top of the file — it was missing before.)
lr0 = LogisticRegression(max_iter=1000)
lr0.fit(X_train_orig, y_train_orig)
y_predict= lr0.predict(X_test_orig)
classification_report(y_test_orig, y_predict, 'Original', 'Logistic Regression')
plot_roc_curve(lr0, X_test_orig, y_test_orig)
# CV is run on the undersampled split throughout (raw-data CV gives
# unbalanced folds — see the earlier comment).
cross_val_report(lr0, X_under_train, y_under_train)
# Logistic regression on the undersampled split.
lr1 = LogisticRegression(max_iter=100)
lr1.fit(X_under_train, y_under_train)
y_predict= lr1.predict(X_test_orig)
classification_report(y_test_orig, y_predict, 'Undersampled', 'Logistic Regression')
plot_roc_curve(lr1, X_test_orig, y_test_orig)
cross_val_report(lr1, X_under_train, y_under_train)
# Logistic regression on the oversampled split.
lr2 = LogisticRegression(max_iter=100)
lr2.fit(X_over_train, y_over_train)
y_predict= lr2.predict(X_test_orig)
classification_report(y_test_orig, y_predict, 'Oversampled', 'Logistic Regression')
# BUG FIX: the ROC plot and CV below evaluated lr1 instead of the lr2 model
# just trained (copy-paste slip).
plot_roc_curve(lr2, X_test_orig, y_test_orig)
cross_val_report(lr2, X_under_train, y_under_train)
# KNN on the oversampled split.
clf_KNN2 = KNeighborsClassifier()
clf_KNN2.fit(X_over_train, y_over_train)
y_predict= clf_KNN2.predict(X_test_orig)
classification_report(y_test_orig, y_predict, 'Oversampled', 'KNN')
plot_roc_curve(clf_KNN2, X_test_orig, y_test_orig)
cross_val_report(clf_KNN2, X_under_train, y_under_train)
Segment the customers by predicted probability: prob > 90%, 90% > prob > 80%, 80% > prob > 70%, 70% > prob > 60%, and so on
# Class-probability predictions from the oversampled-data KNN.
# Columns follow clf_KNN2.classes_ — assuming classes are [0, 1], column 0
# is P(no loan) and column 1 is P(loan). TODO(review): confirm via classes_.
predict_prob = clf_KNN2.predict_proba(X_test_orig)
predict_prob.shape
# print more significant digits here
# predict_prob
predictions = pd.DataFrame(predict_prob, columns=[0, 'PredictionProb'])
# predictions.head()
# Keep only P(class 1) — the probability of taking the loan.
predictions_prob = predictions.drop(0, axis=1)
y = pd.DataFrame(y_test_orig, columns = ['Target'])
# Align indices with the 0..n-1 index of the predictions frame.
y = y.reset_index(drop=True)
error = y.Target- predictions_prob.PredictionProb
# This series indicates the error; values closer to zero mean better predictions.
# Positive means the model should have predicted loan-taker but did not;
# negative means the model predicted loan-taker when it should not have.
# (error = target - probability)
plt.hist(error)
plt.xlabel('Error in Prediction: Target - Probablity Of Prediction')
plt.ylabel("Count")
plt.show()
sns.catplot(x="Occupation", y="SCR", hue = "Target",data=data, kind = "violin", split = True)
plt.show()
sns.relplot(x="Occupation", y="SCR", hue = "Target",data=data, aspect=1.5, kind = "line")
plt.show()
sns.catplot(x="Occupation", y="SCR", hue = "Target",data=data, aspect=2.0, kind = "point")
plt.show()
sns.catplot(x="Occupation", y="SCR", hue = "Target",data=data, kind = "violin", split = True)
plt.show()
The model is good; it makes few mistakes
data1 = data.copy()
# Quartile-based age buckets: 4 equal-frequency bins over Age.
data1['Ageb'] = pd.qcut(data['Age'], q=4)
data1.head()
data1.Ageb.value_counts()
data1.dtypes
data1.Ageb
data1.head(5)
# sns.pairplot(data1, hue='Target')
# One-hot encode everything, including the new interval-valued Ageb column.
data1 = pd.get_dummies(data1)
data1.head(2)
data2 = data1.copy()
data1.columns
# NOTE(review): `cols` is never used after this point — dead variable.
cols = ['Ageb_(30.0, 38.0]', 'Ageb_(20.999, 30.0]','Ageb_(46.0, 55.0]','Ageb_(38.0, 46.0]']
# Map each one-hot Ageb bucket to an ordinal code 1-4 (youngest -> oldest).
#
# BUG FIX: the original built age_d twice. First, four .apply() calls each
# OVERWROTE the previous result (only the last bucket's codes survived, with
# NaN everywhere else, and their 1-4 labels disagreed with the loop that
# followed). An iterrows loop then rewrote every row anyway. Keep the loop's
# final, effective labelling, computed in one vectorised pass per bucket.
_age_buckets = [
    ('Ageb_(20.999, 30.0]', 1),
    ('Ageb_(30.0, 38.0]', 2),
    ('Ageb_(38.0, 46.0]', 3),
    ('Ageb_(46.0, 55.0]', 4),
]
for _col, _code in _age_buckets:
    data1.loc[data1[_col] == 1, 'age_d'] = _code
data1.age_d.value_counts()
# Every row falls in exactly one quartile bucket, so no NaNs remain here.
data1.age_d = data1.age_d.astype('int32')
data1.head()
# SCR distribution by age bucket, split by response.
sns.catplot(x="age_d", y="SCR", hue = "Target",data=data1, kind = "violin", split = True)
plt.show()
data1.columns
sns.scatterplot(x='age_d', y="SCR", hue = "Target",data=data1,)
plt.show()
sns.catplot(x="age_d", y="SCR", hue = "Target",data=data1, kind = "swarm")
plt.show()
sns.set_style('ticks')
g = sns.relplot(x="SCR", y="age_d", hue = "Target",data=data1, aspect=3, kind = "line")
# g.figure.set_size_inches(18.5, 10.5)
# sns.despine()
# Quintile buckets for SCR and quartile buckets for Holding_Period.
data1['SCRb'] = pd.qcut(data['SCR'], q=5)
data1.SCRb.value_counts()
data1['HPB'] = pd.qcut(data['Holding_Period'], q=4)
data1.HPB.value_counts()
data1.head()
# NOTE(review): SCRb/HPB are interval-typed categoricals on the y axis —
# confirm these violin plots render as intended.
sns.catplot(x="age_d", y="SCRb", hue = "Target",data=data1, kind = "violin", split = True)
plt.show()
sns.catplot(x="age_d", y="HPB", hue = "Target",data=data1, kind = "violin", split = True)
plt.show()
sns.pairplot(data1, hue = 'Target')
plt.show()
import shap
# Summarize the training set to 10 k-means centroids — KernelExplainer cost
# scales with the background-set size, so a compact summary keeps it tractable.
X_train_summary = shap.kmeans(X_train_orig, 10)
explainerKNN = shap.KernelExplainer(clf_KNN0.predict,X_train_summary)
shap_values_KNN_test = explainerKNN.shap_values(X_test_orig)
shap.initjs()
# Interactive force plot for the first 1000 test rows.
shap.force_plot(explainerKNN.expected_value, shap_values_KNN_test[:1000,:], X_test_orig.iloc[:1000,:])
import lime
import lime.lime_tabular
# LightGBM hyper-parameters for the binary response model.
lgb_params = {
    'task': 'train',
    'boosting_type': 'goss',   # gradient-based one-side sampling
    'objective': 'binary',
    # BUG FIX: 'metric' appeared twice in this literal; the earlier
    # 'binary_logloss' entry was silently overwritten by this set, which is
    # the value LightGBM actually received. The dead duplicate is removed.
    'metric': {'l2', 'auc'},
    'num_leaves': 50,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbose': None,
    # NOTE(review): 'num_iteration' here competes with the num_boost_round
    # argument passed to lgb.train below — confirm which should win.
    'num_iteration': 100,
    'num_threads': 7,
    'max_depth': 12,
    'min_data_in_leaf': 100,
    'alpha': 0.5}
import warnings
from lime import submodular_pick
import lightgbm as lgb
# Native LightGBM datasets for training and validation.
lgb_train = lgb.Dataset(X_train_orig, y_train_orig)
lgb_eval = lgb.Dataset(X_test_orig, y_test_orig)
model = lgb.train(lgb_params,lgb_train,num_boost_round=20,valid_sets=lgb_eval,early_stopping_rounds=5)
# Remember to convert the dataframe to matrix values
# BUG FIX: the explainer was constructed but never bound to a name, yet
# `explainer` is referenced below — assign it.
explainer = lime.lime_tabular.LimeTabularExplainer(
    data[model.feature_name()].astype(int).values,
    mode='classification', training_labels=data3['Target'],
    feature_names=model.feature_name())
# BUG FIX: `prob` was undefined (NameError). For a binary LightGBM booster,
# predict() returns P(class 1); LIME expects an (n_samples, 2) matrix.
# TODO(review): confirm this is the intended prediction function.
prob = lambda X: np.column_stack([1 - model.predict(X), model.predict(X)])
# SP-LIME returns explanations on a picked sample set to give a
# non-redundant global view of the original model's decision boundary.
sp_obj = submodular_pick.SubmodularPick(explainer, data3[model.feature_name()].values,
                                        prob, num_features=5, num_exps_desired=10)
[exp.as_pyplot_figure(label=1) for exp in sp_obj.sp_explanations]
from catboost import CatBoostClassifier
# Final CatBoost classifier (note: this rebinds `model`, shadowing the
# LightGBM booster trained above).
model = CatBoostClassifier(iterations=1500, learning_rate=0.01, l2_leaf_reg=3.5, depth=5, rsm=0.98,
loss_function= 'Logloss', eval_metric='AUC',use_best_model=True,random_seed=42)
# NOTE(review): these indices come from `data` (pre-dummy columns) but are
# passed as cat_features for X_train_orig (dummy-encoded), so the positions
# likely do not line up — verify against X_train_orig.dtypes.
cate_features_index = np.where(data.dtypes != float)[0]
model.fit(X_train_orig,y_train_orig,cat_features=cate_features_index,eval_set=(X_test_orig,y_test_orig))
test = pd.read_csv("F:\\IBPA_IIM_FinalProject\\Test_targets.csv")
# NOTE(review): only the FIRST column of `test` is passed to predict_proba;
# presumably the full feature frame was intended — verify the file's schema.
pred = model.predict_proba(test.iloc[:,0])